Client Report - Coding Challenge 2

Coding Challenge 2

Author

Ezekial Curran

Show the code
import polars as pl
import numpy as np
from lets_plot import *
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import (
  classification_report, 
  accuracy_score, 
  recall_score, 
  precision_score, 
  f1_score
  )

# Configure lets-plot's HTML output before any plot in this report is rendered.
# NOTE(review): isolated_frame=True presumably makes each plot's HTML
# self-contained (no shared, preloaded JS) — confirm against the lets-plot docs.
LetsPlot.setup_html(isolated_frame=True)
Show the code
# Load both datasets with polars directly from their GitHub URLs.

# Baby-name counts per year (used for the "Peter" chart).
url = "https://github.com/byuidatascience/data4names/raw/master/data-raw/names_year/names_year.csv"
# infer_schema_length=100000 is passed so polars looks at up to that many rows
# when inferring column dtypes.
df = pl.read_csv(url, infer_schema_length=100000)

# Dwellings machine-learning dataset (for the classification question).
url2 = "https://github.com/byuidatascience/data4dwellings/raw/master/data-raw/dwellings_ml/dwellings_ml.csv"
df2 = pl.read_csv(url2)

QUESTION 1

  1. Recreate an image of the history of Peter in Utah and Oregon:

Titles (chart and axis); axis (same data types); lines (colors); annotation lines and text.

Show the code
# Question 1: yearly counts of the name "Peter" in Utah (UT) and Oregon (OR),
# with three years marked by a vertical line and a text label.
_is_peter = pl.col('name').is_in(["Peter"])
df_peter = df.filter(_is_peter).select(["name", "year", "UT", "OR"])

# (year, label height) for each annotation; the label text is the year itself.
_marked_years = [(1936, 50), (1972, 85), (2005, 75)]

# Layers are listed in the exact order they are added to the plot.
_layers = [
    # One line per state, with fixed colors that the title spells out.
    geom_line(mapping=aes(x='year', y='UT'), color='red'),
    geom_line(mapping=aes(x='year', y='OR'), color='orange'),
    # Integer tick labels on both axes; the x axis is pinned to 1910-2025
    # with expand=[0, 0].
    scale_x_continuous(limits=[1910, 2025], format='d', expand=[0, 0]),
    scale_y_continuous(format='d'),
    labs(
        title="The history of Peter for Utah (red) and Oregon (orange)",
        x='Year name given',
        y='Count of Peter',
    ),
]
# Year labels: right-justified and nudged 2 units left of their x position.
_layers += [
    geom_text(x=_year, y=_height, label=str(_year), hjust="right", size=6, nudge_x=-2)
    for _year, _height in _marked_years
]
# Solid black vertical rule at each marked year.
_layers += [
    geom_vline(xintercept=_year, color="black", linetype="solid")
    for _year, _ in _marked_years
]
_layers.append(ggsize(1600, 900))

peter_graph = ggplot(data=df_peter)
for _layer in _layers:
    peter_graph = peter_graph + _layer

display(peter_graph)

# Write the chart to ./plots/peters.svg; sv keeps ggsave's return value.
sv = ggsave(peter_graph, filename="peters.svg", path="./plots/")

QUESTION 2

  1. Calculate the mean after replacing missing values with the standard deviation.
Show the code
# Question 2: mean of the series after every NaN entry is replaced by the
# standard deviation of the non-NaN entries.
problem = pl.Series([np.nan, 18, 22, 45, 31, np.nan, 85, 38, 129, 8000, 22, 2])

# Compute the standard deviation over the non-NaN values only.
_not_missing = problem.is_not_nan()
stdev = problem.filter(_not_missing).std()

# Substitute that standard deviation for the NaN entries, then average.
problem = problem.fill_nan(stdev)
mean = problem.mean()

_summary = f"Mean found: {mean:.2f} from a standard deviation of: {stdev:.2f}"
print(_summary)
Mean found: 1118.72 from a standard deviation of: 2516.33

QUESTION 3

  1. Split the age ranges into a data frame with two columns and re-create a chart

Titles (axis)

Describe response.

Show the code
# Question 3: count how often each age range occurs, derive 'young' and 'old'
# count columns, and draw one box per column.
# NOTE(review): the prompt asks to split the age ranges into two columns; this
# code instead zeroes the count of one range per column — confirm that this is
# the intended chart.
ages = pl.Series([
    "10-25", "10-25", "26-35", "56-85", "0-9", "46-55",
    "56-85", "0-9", "26-35", "56-85", "0-9", "10-25",
])


def _count_unless(bucket):
    """Return an expression equal to 'count', but 0 where the range equals bucket."""
    return pl.when(pl.col('') == bucket).then(0).otherwise(pl.col('count'))


# The value column of the counts frame is addressed by the empty name ''.
cnts = ages.value_counts().sort('')

# 'young' drops the "56-85" count and 'old' drops the "0-9" count; the range
# column is then given the readable name 'ages'.
new_ages = cnts.with_columns(
    _count_unless("56-85").alias('young'),
    _count_unless("0-9").alias('old'),
).rename({'': 'ages'})

# Long format ('variable', 'value') so each column becomes one box.
box_df = new_ages.select(['young', 'old']).unpivot()

box_plt = ggplot(data=box_df, mapping=aes(x="variable", y="value")) + geom_boxplot()

display(box_plt)

# Write the chart to ./plots/box.svg; sv keeps ggsave's return value.
sv = ggsave(box_plt, filename="box.svg", path="./plots/")

QUESTION 4

  1. Use the dwellings ml data to complete the following:

Predict whether a home has more than two bathrooms. Use test_size = .30 and random_state = 2021 in train_test_split(). Use the RandomForestClassifier() method. Report your accuracy and a feature importance plot with the top 10 most important features.

Describe response.

Show the code
# Include and execute your code here